setwd("~/Dropbox/github/art-data-science/notebook/onion")
library(rvest)
## Loading required package: xml2
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(stringr)
library(ggplot2)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(prophet)
## Loading required package: Rcpp
library(ggmap)
##
## Attaching package: 'ggmap'
## The following object is masked from 'package:plotly':
##
## wind
pg.out <- read_html("MonthWiseMarketArrivalsNew.htm")
pg.table <- pg.out %>%
html_node("#dnn_ctr974_MonthWiseMarketArrivals_GridView1") %>%
html_table()
str(pg.table)
## 'data.frame': 3784 obs. of 7 variables:
## $ Market : chr "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
## $ Month Name : chr "January" "January" "January" "February" ...
## $ Year : chr "2014" "2015" "2017" "2014" ...
## $ Arrival (q) : int 440 1305 200 1115 1115 1300 920 670 1350 940 ...
## $ Price Minimum (Rs/q): chr "1025" "1309" "750" "831" ...
## $ Price Maximum (Rs/q): chr "1481" "1858" "1000" "1163" ...
## $ Modal Price (Rs/q) : chr "1256" "1613" "850" "983" ...
df <- pg.table
dim(df)
## [1] 3784 7
column_names <- c('market', 'month', 'year', 'quantity', 'priceMin', 'priceMax', 'priceMod')
colnames(df) <- column_names
head(df)
## market month year quantity priceMin priceMax priceMod
## 1 ABOHAR(PB) January 2014 440 1025 1481 1256
## 2 ABOHAR(PB) January 2015 1305 1309 1858 1613
## 3 ABOHAR(PB) January 2017 200 750 1000 850
## 4 ABOHAR(PB) February 2014 1115 831 1163 983
## 5 ABOHAR(PB) February 2015 1115 1200 1946 1688
## 6 ABOHAR(PB) March 2014 1300 844 1181 994
tail(df)
## market month year quantity priceMin priceMax priceMod
## 3779 YEOLA(MS) November 2015 62522 1003 2460 1996
## 3780 YEOLA(MS) November 2016 45954 207 780 572
## 3781 YEOLA(MS) December 2014 201077 446 1654 1456
## 3782 YEOLA(MS) December 2015 223315 609 1446 1126
## 3783 YEOLA(MS) December 2016 214937 256 753 634
## 3784 Total 268749947 841(Avg) 1544(Avg) 1249(Avg)
df <- df %>%
filter(year != "Total")
dim(df)
## [1] 3783 7
df$quantity <- as.numeric(df$quantity)
df$year <- as.numeric(df$year)
df$priceMin <- as.numeric(df$priceMin)
df$priceMax <- as.numeric(df$priceMax)
df$priceMod <- as.numeric(df$priceMod)
str(df)
## 'data.frame': 3783 obs. of 7 variables:
## $ market : chr "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
## $ month : chr "January" "January" "January" "February" ...
## $ year : num 2014 2015 2017 2014 2015 ...
## $ quantity: num 440 1305 200 1115 1115 ...
## $ priceMin: num 1025 1309 750 831 1200 ...
## $ priceMax: num 1481 1858 1000 1163 1946 ...
## $ priceMod: num 1256 1613 850 983 1688 ...
df <- df %>%
mutate(market1 = market) %>%
separate(market1, c("city", "state"), sep = "\\(")
## Warning: Too many values at 99 locations: 840, 841, 842, 843, 844, 845,
## 846, 847, 848, 849, 850, 851, 852, 853, 854, 855, 856, 857, 858, 859, ...
## Warning: Too few values at 535 locations: 351, 352, 353, 354, 355, 356,
## 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, ...
df$state <- df$state %>% str_replace("\\)","")
df <- df %>%
mutate(state = ifelse(is.na(state), market, state))
head(df)
## market month year quantity priceMin priceMax priceMod city
## 1 ABOHAR(PB) January 2014 440 1025 1481 1256 ABOHAR
## 2 ABOHAR(PB) January 2015 1305 1309 1858 1613 ABOHAR
## 3 ABOHAR(PB) January 2017 200 750 1000 850 ABOHAR
## 4 ABOHAR(PB) February 2014 1115 831 1163 983 ABOHAR
## 5 ABOHAR(PB) February 2015 1115 1200 1946 1688 ABOHAR
## 6 ABOHAR(PB) March 2014 1300 844 1181 994 ABOHAR
## state
## 1 PB
## 2 PB
## 3 PB
## 4 PB
## 5 PB
## 6 PB
df <- df %>%
mutate(date = paste(month, year, sep="-"))
df$date = as.Date(paste("01-",df$date,sep=""), "%d-%B-%Y")
str(df)
## 'data.frame': 3783 obs. of 10 variables:
## $ market : chr "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" "ABOHAR(PB)" ...
## $ month : chr "January" "January" "January" "February" ...
## $ year : num 2014 2015 2017 2014 2015 ...
## $ quantity: num 440 1305 200 1115 1115 ...
## $ priceMin: num 1025 1309 750 831 1200 ...
## $ priceMax: num 1481 1858 1000 1163 1946 ...
## $ priceMod: num 1256 1613 850 983 1688 ...
## $ city : chr "ABOHAR" "ABOHAR" "ABOHAR" "ABOHAR" ...
## $ state : chr "PB" "PB" "PB" "PB" ...
## $ date : Date, format: "2014-01-01" "2015-01-01" ...
df2016 <- df %>%
filter(year == 2016)
head(df2016)
## market month year quantity priceMin priceMax priceMod city
## 1 ABOHAR(PB) June 2016 710 523 755 613 ABOHAR
## 2 ABOHAR(PB) August 2016 60 650 1065 850 ABOHAR
## 3 ABOHAR(PB) September 2016 400 650 800 725 ABOHAR
## 4 ABOHAR(PB) October 2016 350 700 900 850 ABOHAR
## 5 ABOHAR(PB) November 2016 590 775 1059 888 ABOHAR
## 6 ABOHAR(PB) December 2016 330 688 1100 873 ABOHAR
## state date
## 1 PB 2016-06-01
## 2 PB 2016-08-01
## 3 PB 2016-09-01
## 4 PB 2016-10-01
## 5 PB 2016-11-01
## 6 PB 2016-12-01
Split-Apply-Combine
df2016City <- df %>%
filter(year == 2016) %>%
group_by(city) %>%
summarise(quantity_year = sum(quantity)) %>%
arrange(desc(quantity_year)) %>%
filter(quantity_year > 2500000)
head(df2016City)
## # A tibble: 6 × 2
## city quantity_year
## <chr> <dbl>
## 1 BANGALORE 9117473
## 2 MAHUVA 6170912
## 3 PIMPALGAON 3836046
## 4 SOLAPUR 3767140
## 5 LASALGAON 3402394
## 6 PUNE 3339194
ggplot(df2016City) +
aes(reorder(city, quantity_year),weight = quantity_year) +
geom_bar() +
coord_flip()
cities <- unique(df2016City$city)
cities
## [1] "BANGALORE" "MAHUVA" "PIMPALGAON" "SOLAPUR" "LASALGAON"
## [6] "PUNE" "DELHI" "NEWASA" "MUMBAI"
dfCity <- df %>%
filter( city %in% cities)
dim(dfCity)
## [1] 348 10
ggplot(dfCity) + aes(date, priceMod, color=city) + geom_line()
g <- ggplot(dfCity) + aes(date, priceMod, color=city) + geom_line()
ggplotly(g)
dfCityTall <- dfCity %>%
gather("priceType", "priceValue",5:7) %>%
arrange(date)
ggplot(dfCityTall) + aes(date, y = priceValue, color = priceType) + geom_line() + facet_wrap(~city)
dfBang <- df %>%
filter(city == "BANGALORE") %>%
select(date, priceMod) %>%
arrange(date)
ggplot(dfBang) + aes(date, priceMod) + geom_line()
colnames(dfBang) <- c('ds', 'y')
str(dfBang)
## 'data.frame': 40 obs. of 2 variables:
## $ ds: Date, format: "2014-01-01" "2014-02-01" ...
## $ y : num 1094 797 748 712 941 ...
m <- prophet(dfBang)
## Warning in set_auto_seasonalities(m): Disabling weekly seasonality. Run
## prophet with `weekly.seasonality=TRUE` to override this.
## Initial log joint probability = -3.13771
## Optimization terminated normally:
## Convergence detected: absolute parameter change was below tolerance
future <- make_future_dataframe(m, periods = 12)
tail(future)
## ds
## 47 2017-04-08
## 48 2017-04-09
## 49 2017-04-10
## 50 2017-04-11
## 51 2017-04-12
## 52 2017-04-13
forecast <- predict(m, future)
tail(forecast[c('ds', 'yhat', 'yhat_lower', 'yhat_upper')])
## ds yhat yhat_lower yhat_upper
## 47 2017-04-08 738.8072 144.3805 1377.286
## 48 2017-04-09 754.8498 121.4589 1359.882
## 49 2017-04-10 768.9300 158.8456 1344.891
## 50 2017-04-11 780.7612 156.1110 1436.928
## 51 2017-04-12 790.1263 177.7936 1391.326
## 52 2017-04-13 796.8850 191.5294 1396.898
plot(m, forecast)
prophet_plot_components(m, forecast)
uniqcity <- unique(dfCity$city)
geo <- geocode(uniqcity)
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=BANGALORE&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=DELHI&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=LASALGAON&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=MAHUVA&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=MUMBAI&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=NEWASA&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PIMPALGAON&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=PUNE&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=SOLAPUR&sensor=false
dfGeo <- bind_cols(df2016City, geo)
dfGeo
## # A tibble: 9 × 4
## city quantity_year lon lat
## <chr> <dbl> <dbl> <dbl>
## 1 BANGALORE 9117473 77.59456 12.97160
## 2 MAHUVA 6170912 77.10249 28.70406
## 3 PIMPALGAON 3836046 74.23261 20.14914
## 4 SOLAPUR 3767140 71.77046 21.09216
## 5 LASALGAON 3402394 72.87766 19.07598
## 6 PUNE 3339194 74.92811 19.55118
## 7 DELHI 3061788 73.98738 20.16997
## 8 NEWASA 2897566 73.85674 18.52043
## 9 MUMBAI 2872669 75.90639 17.65992
ggplot(dfGeo) + aes(lon, lat, size=quantity_year/1000) + geom_point() + coord_map()
map <- get_map("India", zoom = 5)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&language=en-EN&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
ggmap(map)
map1 <- get_map("India", maptype = "watercolor", source = "stamen", zoom = 5)
## Map from URL : http://maps.googleapis.com/maps/api/staticmap?center=India&zoom=5&size=640x640&scale=2&maptype=terrain&sensor=false
## Information from URL : http://maps.googleapis.com/maps/api/geocode/json?address=India&sensor=false
## Map from URL : http://tile.stamen.com/watercolor/5/21/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/12.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/21/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/13.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/21/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/14.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/21/15.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/22/15.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/23/15.jpg
## Map from URL : http://tile.stamen.com/watercolor/5/24/15.jpg
ggmap(map1)
ggmap(map1) + geom_point(data = dfGeo,aes(lon,lat,size=quantity_year/1000,color=city))